from __future__ import with_statement
import re
import os, os.path
import datetime
import time
import csv


# This file searches through the documents under docpath for mentions of
# country names and captures the date and the surrounding words.
# NOTE: this header originally said "EU press releases", but docpath below
# points at the USBriefs folder and the output file is USmentions.csv.
# To minimize opening files, it opens each file once and searches it for all of
# the country names before moving on to the next file.

## Get the country names and their search terms from the csvs.
csvpath = 'C:/Users/Richard Nielsen/Desktop/Papers/Rewards for Ratification/archive/text/rawdata/csv files with search names/'
csvfilenames = ["ICCPRratdates.csv","CATratdates.csv","Art22ratdates.csv","Op1ratdates.csv"]

## Collect "<column 0>_<column 2>" for every row of every csv file.
## Column 0 is used below as the country name, column 2 as the search term.
allnames = []
for ff in csvfilenames:
    csvpath1 = csvpath + ff
    ## Binary mode is what the Python 2 csv module expects.  The with-block
    ## closes each file as soon as it has been read (the handle used to leak).
    with open(csvpath1, "rb") as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            ## csv.reader yields [] for a blank line; skip it instead of
            ## crashing on row[0].
            if not row:
                continue
            allnames.append(row[0] + "_" + row[2])

## Drop the pairs that appear in more than one file.
allnames = list(set(allnames))

## Split every pair back into its two halves.
## NOTE: this assumes neither half contains "_" itself -- TODO confirm
## against the csv files.
nms = []
abr = []
for i in allnames:
    pieces = i.split("_")
    nms.append(pieces[0])
    abr.append(pieces[1])

## Map each search term to its country name.  If a search term occurs with
## more than one name, the pair that comes last in allnames wins.
abrdict = dict(zip(abr,nms))


## Check that all the search terms are either one word or two
##for a in abr:
##    print len(a.split(" "))
    



## START by changing this to the abbreviations list.
## clist holds the search terms that are looked for in every document.
## It is the same list object as abr, not a copy.

clist = abr
## Alternative for trying out a single search term:
#clist = ["United States"]

## Regular-expression patterns for the punctuation and line-break characters
## that are replaced by a space before a text is split into words.
## Characters with a special meaning in regular expressions are escaped (in
## raw strings) so that they match literally.  In particular "$" has to be
## escaped: unescaped it matches the end of the string, so dollar signs were
## never removed and a stray space was appended instead.
punct = [r"\.", ",", r"\?", "-", r"\(", r"\)", "_", "\"", ":", "=", "!", "'",
         "<", ">", "~", "`", "@", "#", r"\$", "\n", "\r", r"\|", r"\+", ";"]

##         "/","\!","%","^","&","*","\(","\)","_","\+","=",
##         "<",">","~","`","@","#","$"]



# Walk a directory tree and report every file found in it
def filepaths(top_path):
    """Yield a (file name, full path) pair for each file below top_path.

    Sub-directories are searched as well; the full path is the containing
    directory joined with the file name.
    """
    for folder, _subfolders, filenames in os.walk(top_path):
        for filename in filenames:
            full_path = os.path.join(folder, filename)
            yield filename, full_path

# Folder for the search results and folder holding the documents to search
newpath = 'C:/Users/Richard Nielsen/Desktop/Papers/Rewards for Ratification/archive/text/rawdata\\WordSearchResults'
docpath = 'C:/Users/Richard Nielsen/Desktop/Papers/Rewards for Ratification/archive/text/rawdata\\USBriefs'


## Start the results csv from scratch: any existing file is overwritten and
## only the header row is written here.
mydoc = "".join([newpath, "\\USmentions.csv"])
with open(mydoc, 'w') as f:
    f.write('day,month,year,name,veryclose,close,briefname\n')


## ---------------------------------------------------------------------------
## Search every document for every search term and append one csv row per
## mention: day, month, year, country name, the words around the mention,
## an empty "close" column and the document's file name.
## ---------------------------------------------------------------------------

## Number of words kept on each side of a mention ("veryclose" column).
_VERYCLOSE_SPAN = 5


def _open_for_append(path, retry_delay=1):
    """Open path for appending, retrying once after retry_delay seconds.

    Opening the results csv has failed intermittently with an IOError, so a
    single retry is kept as a best-effort safeguard.  A second failure is
    raised to the caller.
    """
    try:
        return open(path, 'a')
    except IOError:
        time.sleep(retry_delay)
        return open(path, 'a')


def _collapse_spaces(text):
    """Strip text and squeeze every run of spaces into a single space."""
    text = text.strip()
    while "  " in text:
        text = text.replace("  ", " ")
    return text


def _clean_text(text):
    """Replace everything matched by the punct patterns with a space.

    The result has no leading/trailing whitespace and no doubled spaces, so
    splitting it on " " gives the list of words.
    """
    text = _collapse_spaces(text)
    for pattern in punct:
        text = re.sub(pattern, ' ', text)
    return _collapse_spaces(text)


def _context(words, start, width, span):
    """Join up to span words on each side of words[start:start + width].

    The matched words themselves are left out.  Slices are clipped by Python
    at the end of the list, so the last word of a document is included (it
    used to be dropped by an off-by-one in the slice bound).
    """
    before = words[max(start - span, 0):start]
    after = words[start + width:start + width + span]
    return ' '.join(before + after)


## Prepare the search terms once instead of once per document.
## NOTE: a search term is used as a regular expression, and it also matches
## when it is only part of a longer word.
## Only one-word and two-word terms are searched for; longer terms never
## produced any output and are left out here.
search_terms = []
for country in clist:
    country_name = abrdict[country]
    n_words = len(country.split(" "))
    if n_words in (1, 2):
        search_terms.append((country_name, re.compile(country.lower()), n_words))

print("starting search")
counter = 0
## The results file is opened once for the whole run rather than once per
## mention.
with _open_for_append(mydoc) as out:
    ## csv.writer quotes a field that contains a comma (a country or file
    ## name, for instance), which plain string concatenation did not.
    ## lineterminator="\n" keeps the row endings identical to rows written
    ## with a plain "\n".
    writer = csv.writer(out, lineterminator="\n")
    for name, path in filepaths(docpath):
        counter = counter + 1
        if counter % 1000 == 0:
            print(counter)
        with open(path) as f:
            text = f.read()

        ## Meta data: the first eight characters of the file name are used
        ## as year (4), month (2) and day (2).
        year = name[0:4]
        month = name[4:6]
        day = name[6:8]

        ## Clean the text once per document, before any matching.  Cleaning
        ## used to happen only after some term had matched the raw text, so
        ## whether a two-word term split by a line break or punctuation was
        ## found depended on the order of the search terms.
        ## NOTE: web links are not removed, so a term inside a link counts
        ## as a mention.
        brief = _clean_text(text.lower())
        words = brief.split(" ")

        for country_name, pattern, n_words in search_terms:
            ## Cheap check on the whole text before going word by word.
            if not pattern.search(brief):
                continue
            ## Slide a window of n_words words over the document.  The range
            ## includes the final window (the last bigram used to be skipped).
            for start in range(len(words) - n_words + 1):
                candidate = ' '.join(words[start:start + n_words])
                if pattern.search(candidate):
                    veryclosewords = _context(words, start, n_words,
                                              _VERYCLOSE_SPAN)
                    ## The "close" column (a wider window) is left empty to
                    ## keep the output small.
                    writer.writerow([day, month, year, country_name,
                                     veryclosewords, "", name])

        ## Push the rows of this document to disk so that little is lost if
        ## the run is interrupted.
        out.flush()

print("done")


            



